{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Financial News - Preprocessing for word2vec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "See notebook [lda_financial_news](../15_topic_modeling/07_financial_news/lda_financial_news.ipynb) for download instructions."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:46:36.726708Z",
     "start_time": "2020-06-21T02:46:35.497658Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "import os, tarfile, sys, json\n",
    "from pathlib import Path\n",
    "from time import time\n",
    "from pprint import pprint\n",
    "from collections import Counter\n",
    "\n",
    "import numpy as np\n",
    "from numpy.random import choice\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "import spacy\n",
    "from spacy.lang.en import English\n",
    "\n",
    "from gensim.models.word2vec import LineSentence\n",
    "from gensim.models.phrases import Phrases, Phraser"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:46:36.730759Z",
     "start_time": "2020-06-21T02:46:36.728045Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "pd.set_option('float_format', '{:,.2f}'.format)\n",
    "sns.set_style('white')\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:46:36.748705Z",
     "start_time": "2020-06-21T02:46:36.732050Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "def format_time(t):\n",
    "    m, s = divmod(t, 60)\n",
    "    h, m = divmod(m, 60)\n",
    "    return '{:02.0f}:{:02.0f}:{:02.0f}'.format(h, m, s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:46:36.937497Z",
     "start_time": "2020-06-21T02:46:36.750140Z"
    }
   },
   "outputs": [],
   "source": [
    "stop_words = set(pd.read_csv('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',\n",
    "                             header=None,\n",
    "                             squeeze=True).tolist())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:46:36.961172Z",
     "start_time": "2020-06-21T02:46:36.946701Z"
    }
   },
   "outputs": [],
   "source": [
    "data_path = Path('..', 'data', 'us-financial-news')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:46:36.942125Z",
     "start_time": "2020-06-21T02:46:36.938723Z"
    }
   },
   "outputs": [],
   "source": [
    "results_path = Path('results', 'financial_news')\n",
    "if not results_path.exists():\n",
    "    results_path.mkdir(exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "article_path = results_path / 'articles.txt'\n",
    "clean_article_path = results_path / 'articles_clean.txt'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:46:36.973821Z",
     "start_time": "2020-06-21T02:46:36.969980Z"
    }
   },
   "outputs": [],
   "source": [
    "section_titles = ['Press Releases - CNBC',\n",
    "                  'Reuters: Company News',\n",
    "                  'Reuters: World News',\n",
    "                  'Reuters: Business News',\n",
    "                  'Reuters: Financial Services and Real Estate',\n",
    "                  'Top News and Analysis (pro)',\n",
    "                  'Reuters: Top News',\n",
    "                  'The Wall Street Journal &amp; Breaking News, Business, Financial and Economic News, World News and Video',\n",
    "                  'Business &amp; Financial News, U.S &amp; International Breaking News | Reuters',\n",
    "                  'Reuters: Money News', 'Reuters: Technology News']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:49:00.283007Z",
     "start_time": "2020-06-21T02:46:36.980900Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done loading 125,964 articles in 00:02:23\n"
     ]
    }
   ],
   "source": [
    "articles = []\n",
    "counter = Counter()\n",
    "start = time()\n",
    "for f in fin_news_path.glob('*/**/*.json'):\n",
    "    article = json.load(f.open())\n",
    "    if article['thread']['section_title'] in set(section_titles):\n",
    "        text = article['text'].lower().split()\n",
    "        counter.update(text)\n",
    "        articles.append(' '.join([t for t in text if t not in stop_words]))\n",
    "\n",
    "print(f'Done loading {len(articles):,.0f} articles in {format_time(time()-start)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:49:01.517103Z",
     "start_time": "2020-06-21T02:49:00.290315Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "418422851"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "article_path.write_text('\\n'.join(articles))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Clean Financial News Articles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:49:04.372584Z",
     "start_time": "2020-06-21T02:49:01.523389Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "125964"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "articles = article_path.read_text().split('\\n')\n",
    "len(articles)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sentence Boundary Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:49:04.379501Z",
     "start_time": "2020-06-21T02:49:04.374311Z"
    }
   },
   "outputs": [],
   "source": [
    "def clean_doc(d):\n",
    "    doc = []\n",
    "    for sent in d.sents:\n",
    "        s = [t.text.lower() for t in sent if not\n",
    "        any([t.is_digit, not t.is_alpha, t.is_punct, t.is_space])]\n",
    "        if len(s) > 5 or len(sent) < 100:\n",
    "            doc.append(' '.join(s))\n",
    "    return doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:49:04.718175Z",
     "start_time": "2020-06-21T02:49:04.383194Z"
    }
   },
   "outputs": [],
   "source": [
    "nlp = English()\n",
    "sentencizer = nlp.create_pipe(\"sentencizer\")\n",
    "nlp.add_pipe(sentencizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:58:54.132054Z",
     "start_time": "2020-06-21T02:49:04.719495Z"
    }
   },
   "outputs": [],
   "source": [
    "clean_articles = []\n",
    "iter_articles = (article for article in articles)\n",
    "for i, doc in enumerate(nlp.pipe(iter_articles, batch_size=100, n_process=8), 1):\n",
    "    if i % int(len(articles) / 100) + 1 == 0:\n",
    "        print(f'{i / len(articles):.1%}', end=' ', flush=True)\n",
    "    clean_articles.extend(clean_doc(doc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:58:55.038684Z",
     "start_time": "2020-06-21T02:58:54.134275Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "356650931"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clean_article_path.write_text('\\n'.join(clean_articles))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Corpus Stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:58:55.043613Z",
     "start_time": "2020-06-21T02:58:55.039660Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2986105"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(clean_articles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:59:05.320049Z",
     "start_time": "2020-06-21T02:58:55.044890Z"
    }
   },
   "outputs": [],
   "source": [
    "vocab = Counter()\n",
    "sent_length = []\n",
    "for sentence in clean_articles:\n",
    "    tokens = sentence.lower().split()\n",
    "    sent_length.append(len(tokens))\n",
    "    vocab.update(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:59:05.323307Z",
     "start_time": "2020-06-21T02:59:05.320947Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2986105"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sent_length)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:59:05.819285Z",
     "start_time": "2020-06-21T02:59:05.324574Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count   2,986,105.00\n",
       "mean           15.36\n",
       "std            20.68\n",
       "min             0.00\n",
       "10%             4.00\n",
       "20%             6.00\n",
       "30%             8.00\n",
       "40%            10.00\n",
       "50%            12.00\n",
       "60%            14.00\n",
       "70%            17.00\n",
       "80%            20.00\n",
       "90%            25.00\n",
       "max         6,910.00\n",
       "dtype: float64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(sent_length).describe(percentiles=np.arange(.1, 1, .1).round(1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:59:06.117128Z",
     "start_time": "2020-06-21T02:59:05.820424Z"
    }
   },
   "outputs": [],
   "source": [
    "most_common = (pd.DataFrame(vocab.most_common(), columns=['token', 'count'])\n",
    "               .pipe(lambda x: x[~x.token.str.lower().isin(stop_words)]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:59:06.404028Z",
     "start_time": "2020-06-21T02:59:06.118008Z"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA2gAAAFyCAYAAACAxPnpAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nOzde1RVZf7H8c85XEQCAsEURRIhL4OpeVlWRpNlFzEyz6ipiVqZY6WliAleyhukhmliZql4T9E4NpqTOdpMmpVhVhqVkpfE0QESZhDowOFwfn808cvxgqJyDvB+/SVn7/08371Xa7U+69n7+xjsdrtdAAAAAACHMzq6AAAAAADArwhoAAAAAOAkCGgAAAAA4CQIaAAAAADgJAhoAAAAAOAkCGgAAAAA4CQIaACA6+LkyZNq1aqVBg8efN6xuLg4tWrVSnl5eVUa+8CBA3rppZcuenz//v166qmn1Lt3b0VFRWnEiBE6fPhwleZypJkzZ6p3797q3bu32rZtqwcffLDib4vFcsFroqOjtW3btmquFABwrbg6ugAAQO1Vr149HTt2TP/85z/VtGlTSVJxcbH2799/VeP++OOPys7OvuCx9PR0jR8/XgsXLlTbtm0lSZs3b1Z0dLQ++OADNWjQ4Krmrk6TJ0+u+Pe9996rpKQk3XrrrQ6sCABwvRHQAADXjYuLi3r27KktW7Zo5MiRkqTt27frvvvuU0pKSsV5qampWr16tYxGowICAjRlyhSFhIRo3759mjVrlsrLyyVJf/7zn9WuXTstWLBAZ8+eVXx8vF555ZVz5lywYIGeffbZinAmSY888ojq1asnm812yfni4uLk4eGhw4cP68yZM7r33nvl6+urv//978rNzdXMmTN1xx13XPZ5Z8+e1bRp0/TDDz/IYDAoIiJCMTExcnV11a233qoRI0Zoz549ysnJ0fDhwzVo0KDLfrZvvPGGtm7dKhcXF4WEhGjKlClq2LBhxfGysjKNGzdOrq6umj17tn755RclJCTo8OHDslqtuuOOO/Tiiy9espbc3FxNmDBB+fn5kqQ//vGPGjNmzBX+VwAAuBK84ggAuK4effRR/eUvf6n4+7333lOfPn0q/v7ss8+0dOlSrVq1Sps3b9bDDz+s5557Tna7XcnJyXriiSdkNpuVmJiozz//XIGBgXr++efVuXPn88KZJH377bfq2LHjeb8/+OCDatiw4SXnk6TvvvtOK1eu1Jo1a5SSkiJPT0+tX79eQ4YM0ZIlSyrGu5zzZs6cKV9fX23ZskVpaWk6dOhQRTAtLS2Vn5+f1q9frwULFuiVV15RSUnJZT3TtLQ07d69W++++662bNmiW265RXFxcRXHrVarXnjhBfn7+yspKUmurq5KTExUeHi4zGaz3nvvPeXn52v58uWXrGXDhg0KCgrSpk2btHbtWv300086e/bsZdUIAKgaVtAAANdV27Zt5eLiom+//Vb+/v4qKipSy5YtK47v3r1bkZGRFa8emkwmJSQk6OTJk+rZs6emT5+ujz76SHfeeadiYmIqnc9oNFasuF3IpeaTpO7du8vNzU0NGzaUp6enIiIiJEnBwcH697//XTHO5Zy3a9curVu3TgaDQe7u7howYIBWrlypESNGSJLuu+8+SVJ4eLhKS0tVXFysevXqVXqPu3btkslkkqenpyRpyJAhWrx4sUpLSyVJs2fPVlFRkf72t7/JYDBIkv7xj3/o4MGDevfddyXpvG/YLlRLRESERowYodOnT+vOO+/UuHHj5O3tXWl9AICqI6ABAK67Rx55RJs3b1aDBg3Uu3fvc45dKEzZ7XaVlZVpwIAB6t69u/bs2aPdu3dr4cKFlTbA6NChg7755ptzQqAkTZs2Tffff/8l55Mkd3f3c465ul74f5WXc155eXlFQPrt79/mkVQRxn4757dVvMpUNu4jjzwiu92uyZMna/HixRXn
vP766woNDZUkFRQUnDPGhWpp166ddu7cqc8++0yff/65+vXrpyVLlpzz+igA4NriFUcAwHXXu3dvbdu2TX/961/18MMPn3MsIiJCf/3rXys6OqalpcnX11c333yzBgwYoO+//14mk0kzZsxQQUGBcnNz5eLick4g+b1nnnlGCxcu1Lffflvxm9ls1ocffqiWLVtecr5r7a677tKaNWtkt9tVWlqqDRs26M4777zqcSMiIpSWlqbi4mJJ0urVq9WlS5eK0NiuXTuNGTNGJ06c0IYNGypqWbFiRUUtzzzzjNasWXPJeZKSkrRo0SL16NFDkyZNUlhYmDIzM6+6fgDAxbGCBgC47ho1aqTQ0FB5e3vL19f3nGPdunXTsGHDNHToUJWXl6tBgwZ66623ZDQaFRsbq8TERM2fP18Gg0GjRo1SUFCQbDab3njjDY0aNUoLFy48Z7zOnTtr5syZSkhIUHFxsaxWq4KDg7Vq1SoFBAQoICDgovNda5MnT9bMmTMVFRUlq9WqiIiIimYpV6Nv3746ffq0+vXrp/Lyct18881KSko655x69epp1qxZevLJJ3X77bdr0qRJSkhIqKjlzjvv1PDhwy85z9ChQxUXF6eHH35Y7u7uatWqlXr16nXV9QMALs5gv9z3KQAAAAAA1xWvOAIAAACAkyCgAQAAAICTIKABAAAAgJMgoAEAAACAkyCgAQAAAICToM1+FXTt2lVNmzZ1dBkAAAAAnNQ///lP7d2794qvI6BVQdOmTWU2mx1dBgAAAAAnZTKZqnQdrzgCAAAAgJMgoFUBW3sDAAAAzsNitTm6hGuGVxyrwGCQmsdtdXQZAAAAACQdn9XL0SVcMw4JaBaLRfHx8Tp16pSsVqsmTpyo1NRUZWVlyWaz6YknnlBkZKSio6PVqlUrZWZmytPTU507d9Ynn3yigoICpaSkaOfOndq5c6cKCwuVn5+v5557Tg8++KC2bdumtWvXVsz3+uuvKzMzU0uWLJGbm5tOnjypyMhI/fnPf9aDDz6ojRs3ytfXV++8846Ki4s1fPhwRzwWAAAAAHWcQ15xXL9+vZo2barU1FTNmjVLX3zxhfz8/LR+/XotX75c8+fPV15eniSpXbt2WrlypUpLS+Xh4aHly5crLCxM6enpkqTi4mItX75cKSkpmjVrlsrKynT8+HG9/fbbWr16tUJCQvTJJ59Ikk6dOqXk5GSlpqZq6dKlMhqNioqK0tatv66Gbd68WY8++qgjHgkAAAAAOCagHT16VB06dJAktWzZUrm5uerSpYskycvLS6GhocrKypIkhYeHS5J8fHwUFhZW8e+SkhJJUpcuXWQ0GhUQECAfHx/l5eXJ399fEyZMUHx8vA4dOqSysrKKuVxdXeXp6SkPDw9JUt++fbV582YdPnxYAQEBCggIqL4HAQAAAAC/45CAFhoaqoMHD0qSsrKytHXrVu3bt0+SVFhYqMOHDysoKOiyxsrIyJAk/fzzzyosLFT9+vW1YMECzZs3TzNnzlS9evVk/29XD4PBcN71TZo0kbe3txYvXqy+fftei9sDAAAAgCpxyDdoAwYM0MSJEzV48GDZbDYtXbpUa9eu1cCBA1VSUqJRo0bJ39//ssb6+eefNXToUJ09e1Yvv/yyvLy81LFjR/Xp00eenp7y8fFRTk7OJQNf//79NXPmTL366quXNafdXrs+RAQAAABqMovVJg83F0eXcU0Y7Paa2zTebDbr6NGjio2Nvapx/vrXvyozM1MvvPDCZZ1vMpnYqBoAAADARVU1M9T5Nvuvvfaa9u3bp0WLFjm6FAAAAAB1XI0OaCaT6arHiImJuQaVAAAAAMDVc0iTEAAAAADA+QhoAAAAAOAkCGgAAAAA4CRqTUAbNWqUJCk6OlpHjhxRcnKy1q1bp++//14LFy68pnPV3L6XAAAAgHOyWG2OLsEp1OgmIb93sRDWpk0btWnT5prOZTBIzeO2XtMxAQAAgLqMfYZ/VWMCmtls
1t///ndZLBbl5uZqyJAh2rlzpzIzM/Xiiy/q5Zdf1p49e867bu/evVq/fr3mzZunzZs3a+XKlXJ3d1fz5s01ffp0bdmyRR9//LEsFotOnDihp59++pp0hwQAAACAK1VjApokFRUVKSUlRVu3btWKFSu0YcMG7d27V6tWrar02vz8fCUnJ2vTpk3y8vJSYmKiUlNT5enpqcLCQi1btkzHjx/XyJEjCWgAAAAAHKJGfYP226uK3t7eCg0NlcFg0I033qiSkpJKr83KylJYWJi8vLwkSV26dFFmZqYkqXXr1pKkwMBAlZaWXqfqAQAAAODSalRAMxgMVb42KChIR44cUXFxsSTpiy++UEhIyFWPCwAAAADXSo16xfFqNGjQQKNHj9aQIUNkNBoVHBys2NhYbd165c0+7HY+YgQAAACuJYvVJg83F0eX4XAGu52m8VfKZDLJbDY7ugwAAAAATqqqmaFGveIIAAAAALUZAQ0AAAAAnAQBDQAAAACcBAENAAAAAJwEAQ0AAAAAnAQB7XfWrFlzWefR9xIAAKDmsVhtji4BqFSd2Qftcrz55psaPHhwpecZDFLzuCvfPw0AAACOwz62qAlqfUAzm836+OOPZbFYdOLECT399NMKDw/XzJkzJUm+vr5KTEzUmjVr9J///EdTp07V1KlTHVs0AAAAgDqp1gc0SSosLNSyZct0/PhxjRw5Uj4+PkpMTFRYWJg2btyopUuXauzYsVqzZg3hDAAAAIDD1ImA1rp1a0lSYGCgSktLdeTIEU2bNk2SZLVaFRIS4sjyAAAAAEBSHQloBoPhnL9DQkI0e/ZsNWnSRF9++aVyc3MlSXa6fwAAAABwoDoR0P7X1KlTNWHCBNlsv3bySUhIkCSFhoYqNjZWSUlJjiwPAAAAQB1lsLNsdMX69DFp0yazo8sAAADAFbBYbfJwc3F0GagjTCaTzOYrzwzsg1YF//PGJAAAAGoAwhlqAgIaAAAAADgJAhoAAAAAOAkCGgAAAAA4CQIaAAAAADiJOhnQ3n77bR04cOCc30pKSnTvvfde1vX0vQQAAHWZxWpzdAlArVUn90EbMWLEVV1vMEjN47Zeo2oAAABqluOzejm6BKDWqlUB7dixY4qPj5erq6tcXFw0Z84cLViwQP/617+Un5+vu+++W2PGjFFcXJwiIyPVqVMnxcbGqqCgQMHBwY4uHwAAAEAdV6sC2qeffqrw8HDFxcVp3759+s9//qMOHTqoX79+KikpqQhov9m0aZNatmypsWPH6ptvvtHevXsdWD0AAACAuq5WBbS+fftqyZIlGj58uLy9vTVq1CgdPHhQn3/+uby8vFRaWnrO+ZmZmYqIiJAktW/fXq6utepxAAAAAKhhalWTkJ07d6pTp05auXKlHnroIfXu3Vve3t6aO3eunnzySVksFtl/1+GjRYsW+vrrryVJ3333ncrKyhxVOgAAAADUrhW0tm3bavz48UpOTpbRaNQ777yjqVOn6ssvv1T9+vV18803Kycnp+L8xx9/XPHx8Ro4cKBatGghNzc3B1YPAAAAoK4z2O00jb9SffqYtGmT2dFlAAAAOITFapOHm4ujywCcmslkktl85ZmhVr3iWF0MBkdXAAAA4DiEM+D6IaABAAAAgJMgoAEAAACAkyCgAQAAAICTIKABAAAAgJNw2oC2a9cupaamOrqMC6LvJQAAcDSL1eboEgBcB067D9rdd9/t6BIuymCQmsdtdXQZAACgDjs+q5ejSwBwHThtQDObzdq9e7dOnTqlxo0bKysrS7feequmTZumM2fOKC4uTmfPnpXdbtfs2bPVoEEDjR8/XoWFhbLZbHrhhRd0xx13KCoqSp07d9bhw4cVEhIif39/7du3T+7u7nr77bdlsVg0adIk5efnS5ImT56sVq1aOfjuAQAAANRFThvQfnP8+HEtW7ZM9evXV48ePZSbm6u33npL9957rwYOHKjPPvtMBw4c0Pfff68777xTQ4cOVXZ2
tgYOHKgdO3aoqKhIDz/8sDp16qSHHnpI8fHxGjt2rAYPHqwff/xR77//vm6//XYNGjRIx48fV3x8vNatW+fo2wYAAABQBzl9QAsODpaXl5ckqWHDhiopKdGxY8fUt29fSdIdd9whSXr//fcVFRUlSWrUqJG8vLyUl5cnSQoPD5ck+fj4KDQ0tOLfJSUlOnz4sD7//HN98MEHkqSCgoLquzkAAAAA+B2nD2gGg+G830JDQ3Xw4EG1bt1a6enp+sc//qHQ0FDt27dPf/jDH5Sdna2CggL5+vpedIzftGjRQo888oiioqJ05swZbdy48brdCwAAAABcitMHtAsZOXKkJk6cqM2bN0uSEhMT5e3trYkTJ+rDDz+UxWLR9OnT5epa+e2NHDlSkyZN0oYNG1RYWKhRo0Zd7/IBAAAA4IIMdjtN469Unz4mbdpkdnQZAACgDrNYbfJwc3F0GQAuwmQyyWy+8szgtPugObNLvDEJAABQLQhnQO1EQAMAAAAAJ0FAAwAAAAAnQUADAAAAACdBQAMAAAAAJ0FAqwL6XgIAUHNYrDZHlwAAl61G7oPmaAaD1Dxuq6PLAAAAl+H4rF6OLgEALlutCWjjxo1TVFSU7rnnHh05ckSzZ89WQECAfvrpJ5WXl2vMmDHq2rWrtm3bprVr11Zc9/rrryszM1NJSUlyc3NT//799eijjzrwTgAAAADUVbUmoPXr10/r1q3TPffco3fffVe33XabCgsLlZiYqPz8fA0ePFhbt27V8ePH9fbbb6t+/fp66aWX9Mknn6hRo0YqKSnRxo0bHX0bAAAAAOqwWhPQunbtqoSEBJ05c0Z79uzRbbfdpv379+vAgQOSpLKyMuXn58vf318TJkzQDTfcoKNHj6pDhw6SpJCQEEeWDwAAAAC1J6AZDAZFRUUpISFB3bp1U2BgoAIDAzVy5EhZLBa9+eabcnV11YIFC/SPf/xDkvTEE0/I/t+OH0Yj/VIAAAAAOFatCWiSZDKZdM899+gvf/mLmjVrpsmTJ2vw4MEqLCzUoEGD5OXlpY4dO6pPnz7y9PSUj4+PcnJyFBQU5OjSAQAAAKB2BTSbzaZOnTopNDRUkjRnzpzzznn99dcveG3Xrl0vex67nY5QAADUFBarTR5uLo4uAwAuS615r+/DDz/U8OHDNW7cuOs+l8Fw3acAAADXCOEMQE1Sa1bQHnzwQT344IOOLgMAAAAAqqzWrKABAAAAQE1HQAMAAAAAJ0FAAwAAAAAnQUCrgv9unQYAACphsdocXQIA1Ci1pklIZVJTU2UymeTm5nbVYxkMUvO4rdegKgAAaje2pQGAK1NnVtDeeustlZeXO7oMAAAAALgop11BKyoq0rhx41RQUKCwsDB99dVX8vX11dSpUxUaGqp169bp559/1ujRozV37lx9++23KioqUmhoqF555RUlJyfrq6++UnFxsaKiopSbm6uxY8dq0aJFmjt3rtLT02W32zVs2DD17NlT0dHR8vPzU0FBgZYtWyYXF/ZMAQAAAFC9nDagvfPOO2rVqpXGjh2r/fv365NPPpGvr+955xUWFsrHx0fLly9XeXm5evXqpezsbElSixYtNHnyZEnSsmXLNG/ePH388cc6efKk1q9fr5KSEvXv31/dunWTJEVFRen++++vvpsEAAAAgN9x2oB28uRJRURESJI6duwod3f3c47b/9upo169esrLy1NMTIw8PT1VXFwsq9UqSQoJCTlv3MOHDysjI0PR0dGSpLKyMp06deqi5wMAAABAdXHab9BatWql/fv3S5IOHTqk0tJSubu7Kzc3V5L03XffSZJ27dql06dP67XXXlNMTIwsFktFeDMa///2DAaDysvL1aJFC3Xt2lWrV6/WypUr1bNnTwUFBVWcAwAAAACO4rQraP369dOkSZP0+OOPq0mTJpKkIUOGaPr06QoMDNRNN90kSWrXrp0WLVqk/v37y93dXc2aNVNOTs5543Xu3FkjRozQqlWr
9MUXX2jQoEEqLi5Wjx495OXldUW12e10pQIA4HJYrDZ5uPFdNwBcLoPd7vy7epWUlKhnz5766KOPHF2KJMlkMslsNju6DAAAAABOqqqZwWlfcQQAAACAuqZGBLR69eo5zeoZAAAAAFwvNSKgAQAAAEBdQEADAAAAACdBQKsC52+rAgC4WharzdElAADqIKdos2+z2TRixAhlZmZq7Nix6tOnz3WZ57c90x577LELHk9OTlZAQIAGDhx4yXEMBql53NbrUSIAwEmwnQoAwBGcIqDl5uYqPz9fu3btuq7z3H333dd1fAAAAAC4Gk4R0KZMmaLjx4/rpZdeUps2bdSiRQstWbJEbm5uOnnypCIjI/XMM8/o8OHDmjVrlsrLy1VQUKDJkyerY8eOeuCBB9SxY0cdO3ZM/v7+Sk5OltVqVXx8vE6dOiWr1aopU6bo2LFjOnr0qGJjYzV37lx9++23KioqUmhoqF555RVHPwYAAAAAdZxTBLSXX35ZMTExatiwYcVvp06d0ubNm1VaWqqIiAg988wz+vHHHzVhwgS1atVKW7ZskdlsVseOHZWVlaWVK1cqMDBQAwYM0MGDB/X111+radOmmjdvng4fPqxPP/1UPj4+kqTCwkL5+Pho+fLlKi8vV69evZSdne2o2wcAAAAASU4S0C6kZcuWcnV1laurqzw8PCRJN910kxYtWiQPDw8VFRXJy8tLkuTn56fAwEBJUmBgoEpKSnT06NGKVxpbtmypli1bVuzkXa9ePeXl5SkmJkaenp4qLi6W1Wp1wF0CAAAAwP9z2i6OBoPhvN8SEhL0/PPPa/bs2WrZsqXs/22neKFzQ0NDdfDgQUlSVlaWxo0bV3Hst2Yhr732mmJiYmSxWCrGAgAAAABHcdoVtAt55JFH9Oyzz8rf31+NGzdWfn7+Rc8dMGCAJk6cqMGDB8tms2nixInKzMyUJLVr106LFi1S//795e7urmbNmiknJ+ey67Db6e4FALWdxWqTh5uLo8sAANQxBjtLR1fMZDJVvC4JAAAAAP+rqpnBaV9xBAAAAIC6hoAGAAAAAE6CgAYAAAAAToKABgAAAABOgoBWBbRVAQDnYbHaHF0CAADXTI1qs389xMXFKTIysmJT68thMEjN47Zex6oAAJeLbU8AALUJK2gAAAAA4CRqxQqaxWJRfHy8Tp06JavVqri4OK1du1Znz55Vfn6++vXrp0GDBmnt2rV67733ZDQa1bFjR02YMEGSlJqaqqVLl6qwsFBTp05Vu3btHHxHAAAAAOqiWrGCtn79ejVt2lSpqamaNWuWMjIy1KtXL6WkpGjx4sVasWKFJMlsNmvSpElKTU1Vs2bNVFZWJkkKDw/XqlWrNHjwYDagBgAAAOAwtWIF7ejRoxXfkLVs2VI33nij5s6dq+3bt8vLy6siiL3yyitKSUlRUlKSOnToIPt/u32Eh4dLkgICAmSxWBxzEwAAAADqvFqxghYaGqqDBw9KkrKysjRjxgx16NBBSUlJeuihhyqC2IYNGzRt2jStWbNG33//vb766itJksFgcFjtAAAAAPCbWrGCNmDAAE2cOFGDBw+WzWbTfffdp1WrVmnLli3y9fWVi4uLSktL1apVK/Xt21d+fn5q1KiR2rdvX6VXGu12uoYBgLOwWG3ycHNxdBkAAFwTBrudXb2ulMlk4ls1AAAAABdV1cxQK15xBAAAAIDagIAGAAAAAE6CgAYAAAAAToKABgAAAABOgoBWBbRVAQDHslhtji4BAIDrola02d+7d6/Wr1+vefPmVct8BoPUPG5rtcwFADgfW50AAGorVtAAAAAAwEnUyBW0Y8eOKT4+Xq6urnJxcdGf/vQn/fTTTxo+fLjy8vLUvXt3jR49Wl988YUWLlwoSbJYLJo9e7bc3Nz0zDPPyNfXV3fffbfuvvtuzZw5U5Lk6+urxMREeXt7O/L2AAAAANRRNTKgffrppwoPD1dcXJz27dunI0eOqKSkRIsWLZLN
ZtM999yj0aNHKzMzU6+++qoaNWqkxYsXa9u2bYqKilJubq7S0tLk7u6u/v37KzExUWFhYdq4caOWLl2qsWPHOvoWAQAAANRBNTKg9e3bV0uWLNHw4cPl7e2tbt266ZZbbpG7u7skydX119tq1KiREhIS5OnpqezsbHXs2FGSFBQUVHHukSNHNG3aNEmS1WpVSEiIA+4IAAAAAGpoQNu5c6c6deqkUaNG6f3339drr72m9u3bn3fe5MmTtWPHDnl5eWnChAmy/7f9otH4/5/ehYSEaPbs2WrSpIm+/PJL5ebmVtt9AAAAAMDv1ciA1rZtW40fP17JyckyGo2Kjo7WgQMHzjuvd+/e6t+/v3x8fBQQEKCcnJzzzpk6daomTJggm+3Xls0JCQmVzm+300EMABzJYrXJw83F0WUAAHDNGex2dvW6UiaTSWaz2dFlAAAAAHBSVc0MtNkHAAAAACdBQAMAAAAAJ0FAAwAAAAAnQUADAAAAACdBQKsC2qoAwK+dFAEAwLVVI9vsVyY9PV3e3t5q3br1dRnfYJCax229LmMDQE3BdiMAAFx7tXIFLS0t7YJ7ngEAAACAM3PqFTSz2ay0tDSVl5crOjpaK1eulNFoVKdOnRQbG6vk5GQFBARo4MCBOnLkSMWm07t371ZGRobCwsL0zTffaMWKFedd99VXX6m4uFgJCQl69dVXVVhYKIvFovHjx6tr166OvnUAAAAAdZBTBzRJ8vHx0SuvvKJBgwYpLS1N9evX1/jx47Vnz54Lnt+2bVtFREQoMjJSnp6eSk5OvuB1LVq00OTJk5WZmamff/5ZK1as0JkzZ3T8+PFqvDsAAAAA+H9OH9BCQkJ04sQJ5eXlacSIEZKkoqIiZWVlVXrtpa4LCQmRJN1yyy16/PHHFRMTo7KyMkVHR1+nOwEAAACAS3P6gGY0GhUUFKTAwEClpKTIzc1NZrNZbdq00e7du5WbmytJysjIqLjGYDDIbrdf9LodO3bIaPz187tDhw6pqKhIb7/9tnJycjRgwAB1797dIfcKAAAAoG5z+oAmSQ0aNNCwYcMUHR0tm82mpk2bqmfPnvLy8tKYMWOUnp6utm3bVpzfvn17JSUlaf78+Re87veaN2+uN954Q++99ydBacUAACAASURBVJ7c3Nz0/PPPV1qP3U73MgCwWG3ycHNxdBkAANQqBrudXb2ulMlkktlsdnQZAAAAAJxUVTNDrWyzDwAAAAA1EQENAAAAAJwEAQ0AAAAAnAQBDQAAAACcBAGtCmirAqCusFhtji4BAIA6pUa02Xc2BoPUPG6ro8sAgOuOLUUAAKherKABAAAAgJOolStoZrNZH3/8sSwWi06cOKGnn35arVu31owZM+Ti4qJ69eppxowZKi8v17hx49S4cWNlZWXp1ltv1bRp0xxdPgAAAIA6qlYGNEkqLCzUsmXLdPz4cY0cOVKenp5KSEhQmzZttGPHDs2aNUsvvviijh8/rmXLlql+/frq0aOHcnNz1bBhQ0eXDwAAAKAOqrWvOLZu3VqSFBgYqNLSUuXk5KhNmzaSpC5duigzM1OSFBwcLC8vL7m4uKhhw4YqKSlxWM0AAAAA6rZaG9AMBsM5f99000364YcfJEnp6elq3rz5Bc8DAAAAAEepta84/q+ZM2dqxowZstvtcnFxUWJiYpXHstvpbAagbrBYbfJwc3F0GQAA1BkGu/3Su3q99957euutt1RaWiq73S6DwaCdO3dWV31OyWQyyWw2O7oMAAAAAE6qqpmh0hW0JUuWaPHixQoMDKxSYQAAAACAy1NpQGvWrJluvvnm6qgFAAAAAOq0SgOah4eHhg8frjZt2lQ01IiJibnuhQEAAABAXVNpQPvjH/9YHXUAAAAAQJ1XaZv9qKgolZWVKSsrS02aNCGw6dcujgBQG1msNkeXAABAnVbpCtrLL7+sm266SZ9++qnatm2rCRMmaMmSJVc80Zo1azR48OCLHj906JAKCgrUpUuX
Kx67KtLT0+Xt7V2xofWVMBik5nFbr0NVAOBYbCECAIBjVbqCduLECb3wwgtyd3fXvffeq7Nnz1ZpojfffPOSx7dv364ff/yxSmNXRVpamnJycqptPgAAAACoTKUraDabTXl5eTIYDCosLJTRWGmm07FjxxQfHy9XV1e5uLjo9ttv13/+8x9NnTpVsbGxmjRpks6ePav8/Hz169dP9913nzZt2iQ3NzeFh4fLYrFo3rx5cnFxUbNmzTR9+nRt2bJFf//732WxWJSbm6shQ4Zo586dyszM1IsvvqgePXrogw8+0IoVK2Q0GtWpUyfFxsYqOTlZJ0+e1JkzZ3Tq1CnFx8fLz89Pu3fvVkZGhsLCwrRgwQKdOHFCJSUleuqppxQZGXlNHi4AAAAAXIlKA9qYMWM0cOBA5ebm6rHHHtOkSZMqHfTTTz9VeHi44uLitG/fPvn7+2vNmjWaOnWqMjIy1KtXLz3wwAPKzs5WdHS0Bg0apD59+iggIEC33nqrHnroIb3zzjvy9/fX/PnztWnTJrm6uqqoqEgpKSnaunWrVqxYoQ0bNmjv3r1atWqVOnfurOTkZKWlpal+/foaP3689uzZI0lyd3fX0qVLtWfPHqWkpGjZsmWKiIhQZGSkfHx8tHfvXqWlpUlSxTUAAAAAUN0qDWi33XabPvzwQ+Xl5cnPz09ZWVmVDtq3b18tWbJEw4cPl7e3t8aOHVtxLCAgQCtXrtT27dvl5eWlsrKyc67Ny8tTTk6OxowZI0myWCzq1q2bgoOD1aZNG0mSt7e3QkNDZTAYdOONN6qkpEQnTpxQXl6eRowYIUkqKiqqqPW36xo3bqzS0tJz5vPy8tKUKVM0ZcoUFRYW6pFHHqn0/gAAAADgeqg0oI0bN04LFixQgwYNtH79ei1fvlwffvjhJa/ZuXOnOnXqpFGjRun999/X0qVLZf9v68OUlBR16NBBgwYN0ueff66PP/5YkmQwGFReXi4/Pz81btxYixYtkre3t3bu3ClPT0+dPn26Yh+2CwkKClJgYKBSUlLk5uYms9msNm3aaMeOHRe8zmAwyG63KycnRxkZGXrjjTdUUlKiP/7xj+rdu7dcXSt9NAAAAABwTVWaQu644w6NHz9eZ8+elbe3tzZs2FDpoG3bttX48eOVnJwso9Go+Ph4nTx5UrGxserbt6+mTp2qLVu2yNfXVy4uLiotLVXbtm01Z84chYaGatKkSRoxYoTsdrtuuOEGzZkzR6dPn77knA0aNNCwYcMUHR0tm82mpk2bqmfPnhc9v3379kpKStL8+fOVm5urRx99VJ6ennryyScrDWd2O53OANROFqtNHm4uji4DAIA6y2C3X3hXr9+/Crh69Wp99tlnWrRokaRfv+mqy0wmk8xms6PLAAAAAOCkqpoZLrpU9NBDD1W8Gvhbhvvtt507d1axTAAAAADAxVw0oH300UcV/7bb7crPz9eNN94oFxdefQEAAACA66HSTc327t2rHj166Mknn9T9999PG3oAAAAAuE4qbRIyf/58vfPOO2rUqJGys7M1atQodevWrTpqAwAAAIA6pdIVNBcXFzVq1EiS1KhRI9WrV++6FwUAAAAAdVGlK2heXl5avXq1unTpovT0dPn6+lZHXU7twn0vAaBmorU+AADOo9KAduutt+r06dOaP3++WrRooQYNGlRHXU7NYJCax211dBkAcE2wryMAAM7joq84bty4UY899piWL1+uL7/8Uvn5+UpPT9fBgwers74rZjab9cILL+jPf/6zevbsKbPZrEOHDik6OlrR0dEaPXq0zp49q2effbbiXh588EH97W9/kyQ9+eSTys7OduQtAAAAAKijLrqC1rt3b91xxx166623NHLkSEmS0WiUv79/tRVXVYWFhVq2bJmOHz+ukSNHysfHR4mJiQoLC9PGjRu1dOlSPfDAA9q1a5d8fX1Vr1497dmzR7fffrtKSkoqvrkDAAAAgOp00YDm7u6uoKAgzZgx
ozrruSZat24tSQoMDFRpaamOHDmiadOmSZKsVqtCQkL05JNP6tlnn5Wfn5+efvppLV++XLt27VL37t0dWToAAACAOqzSb9BqIoPBcM7fISEhmj17tpo0aaIvv/xSubm5uvHGG+Xh4aEPPvhAycnJ+vDDD7Vy5UolJSU5qGoAAAAAdV2tDGj/a+rUqZowYYJsNpskKSEhQZJ03333yWw2y9fXV3fddZfeeecdBQcHVzqe3c5H9QBqD7o4AgDgPAx2O03jr5TJZJLZbHZ0GQAAAACcVFUzQ6UbVQMAAAAAqgcBDQAAAACcBAENAAAAAJwEAQ0AAAAAnAQBDQAAAACcRK0MaIcOHVJ6erokaezYsSotLb2m49P3EkBtYLHaHF0CAAD4H7VyH7Tt27crICBAXbp00bx58675+AaD1Dxu6zUfFwCqE/s5AgDgfKo9oFmtVk2cOFFZWVmy2Wx64okntG7dOoWEhOjYsWOy2+2aN2+eGjZsqLlz5yo9PV12u13Dhg1Tz549FR0dLT8/PxUUFCg5OVmTJ0/W2bNnlZ+fr379+um+++7Tpk2b5ObmpvDwcI0ZM0YffPCBXn75Zbm7u+uf//yncnJyNGvWLIWHh2vjxo1au3atbrzxRrm5uSkyMlImk6m6HwsAAAAAVH9AS01NlZ+fn1599VUVFhbKZDLJ3d1df/rTnzR9+nStXbtWb731liIiInTy5EmtX79eJSUl6t+/v7p16yZJioqK0v3336+MjAz16tVLDzzwgLKzsxUdHa1BgwapT58+CggIULt27c6Zu0mTJpo+fbo2bNig1NRUjRkzRkuXLtV7770nd3d3DRkypLofBwAAAABUqPaAduTIEd15552SJC8vL4WGhmrPnj26/fbbJUkdO3bURx99pEaNGikjI0PR0dGSpLKyMp06dUqSFBISIkkKCAjQypUrtX37dnl5eamsrOySc7dp00aS1LhxY+3fv18nTpxQaGio6tevL0m67bbbrv0NAwAAAMBlqvYmIaGhodq3b58kqbCwUIcPH1ZQUJC+/fZbSdL+/fsVFhamFi1aqGvXrlq9erVWrlypnj17KigoSJJkMBgkSSkpKerQoYOSkpL00EMPyf7f7h0Gg0Hl5eXnzf3bdb8JDg7W0aNHZbFYVF5ergMHDly3+wYAAACAylT7Clr//v01ZcoUDRw4UCUlJRo1apTMZrM2bdqkFStWqH79+pozZ458fX31xRdfaNCgQSouLlaPHj3k5eV1zljdu3fX1KlTtWXLFvn6+srFxUWlpaVq27at5syZo9DQ0EvW0qBBAz399NMaNGiQfH19VVJSIlfXyh+J3c7H9QBqPovVJg83F0eXAQAAfsdgtzu+aXx0dLSmTp1aaaC61srKyrRkyRI988wzkqTHH39cY8aMUZcuXS55nclkktlsro4SAQAAANRAVc0MtbLN/uVydXXVL7/8oj59+sjNzU3t2rVT586dHV0WAAAAgDrKKQLa6tWrHTZ3TEyMYmJiHDY/AAAAAPym2puEAAAAAAAujIAGAAAAAE6CgAYAAAAAToKA9jvp6en64YcfKj3P8X0vAeDKWaw2R5cAAAAq4RRNQpxFWlqaIiMj1bp160ueZzBIzeO2VlNVAHBtsH8jAADOzykDmtls1s6dO1VYWKj8/Hw999xz8vPz07x58+Ti4qJmzZpp+vTp2rJli9LS0lReXq7nn39eJ0+e1Lp161ReXq777rtPo0eP1gcffKAVK1bIaDSqU6dOio2NVXJysk6ePKkzZ87o1KlTio+Pl5+fn3bv3q2MjAyFhYWpSZMmjn4MAAAAAOoYpwxoklRcXKzly5crLy9P/fr1k9Fo1IYNG+Tv76/58+dr06ZNcnV1lY+Pj958802dOXNGL7/8sjZv3ix3d3fNmjVLp06dUnJystLS0lS/fn2NHz9ee/bskSS5u7tr6dKl2rNnj1JSUrRs2TJFREQoMjKScAYAAADAIZw2oHXp0kVGo1EB
AQGqX7++fvrpJ40ZM0aSZLFY1K1bNwUHByskJESSlJWVpVtuuUUeHh6SpIkTJ+rAgQPKy8vTiBEjJElFRUXKysqSJLVp00aS1LhxY5WWllb37QEAAADAeZw2oGVkZEiSfv75Z5WUlCg4OFiLFi2St7e3du7cKU9PT50+fVpG4699ToKDg3X06FGVlpbK3d1dzz//vCZMmKDAwEClpKTIzc1NZrNZbdq00Y4dO2QwGM6b02AwyE4HEAAAAAAO4rQB7eeff9bQoUN19uxZvfzyyzIajRoxYoTsdrtuuOEGzZkzR6dPn644v0GDBnr66ac1ePBgGQwGde/eXU2bNtWwYcMUHR0tm82mpk2bqmfPnheds3379kpKSlJQUJBCQ0Mvep7dzsf2AGoei9UmDzcXR5cBAAAuwWB3wiUjs9mso0ePKjY21tGlXJDJZJLZbHZ0GQAAAACcVFUzA/ugAQAAAICTcMpXHE0mk6NLAAAAAIBqxwoaAAAAADgJAhoAAAAAOAkCGgAAAAA4iVoV0EpKSrRx48aLHk9PT9cPP/xw0eNms1lJSUmVzuN8fS8B4OIsVpujSwAAAJfJKZuEVFVubq42btyofv36XfB4WlqaIiMj1bp166uax2CQmsdtvaoxAKC6sG8jAAA1R60KaIsXL9aPP/6ohQsX6uDBgyosLJTNZtMLL7wgb29v7d69WxkZGQoLC9NHH32k7du3q6ysTN7e3kpOTnZ0+QAAAADquFoV0EaOHKnDhw+rqKhId955p4YOHars7GwNHDhQO3bsUEREhCIjI9W4cWP9+9//1ooVK2Q0GvXUU0/p4MGDji4fAAAAQB1XqwLab44cOaKoqChJUqNGjeTl5aW8vLyK40ajUW5uboqJiZGnp6f+9a9/qayszFHlAgAAAICkWhbQjEajysvLFRoaqn379ukPf/iDsrOzVVBQIF9fXxkMBtntdv3www/asWOHNm7cqF9++UUmk0l2On8AAAAAcLBaFdD8/f1ltVp19uxZ/fTTT/rwww9lsVg0ffp0ubq6qn379kpKStJrr72m+vXry2Qyyd3dXQ0bNlROTs5lz2O389E9gJrDYrXJw83F0WUAAIDLYLCzdHTFTCaTzGazo8sAAAAA4KSqmhlq1T5oAAAAAFCTEdAAAAAAwEkQ0AAAAADASRDQAAAAAMBJENAAAAAAwEkQ0KqAvpcAnJnFanN0CQAAoIpq1T5o1cVgkJrHbXV0GQBwQezTCABAzVXjA5rFYtGLL76onJwcBQYGKj09Xa+99poWLlxYcXz27NkKCQnR3Llz9e2336qoqEihoaF65ZVXlJycrKNHj+rMmTMqKCjQ5MmT1blzZwffFQAAAIC6qMYHtNTUVAUFBWnBggU6cuSIHn74YWVmZurVV19Vo0aNtHjxYm3btk3R0dHy8fHR8uXLVV5erl69eik7O1uS5OHhoVWrVikzM1Pjxo3T5s2bHXxXAAAAAOqiGh/Qjhw5orvvvluSFBoaqgYNGqhRo0ZKSEiQp6ensrOz1bFjR9WrV095eXmKiYmRp6eniouLZbVaJUm33367JOmWW27Rzz//7LB7AQAAAFC31fgmIS1bttRXX30lSTpx4oTy8/M1efJkJSYmatasWbrppptkt9u1a9cunT59Wq+99ppiYmJksVhk/2+3j4yMDEnS4cOH1ahRI4fdCwAAAIC6rcavoPXt21dxcXF6/PHH1aRJE9WrV0+9e/dW//795ePjo4CAAOXk5GjgwIFatGiR+vfvL3d3dzVr1kw5OTmSpO+//15Dhw7VL7/8ohkzZlQ6p93OR/gAnJfFapOHm4ujywAAAFVQ4wPad999p759++quu+7S8ePH9dVXXyk+Pl7x8fHnnZuWlnbeb59++qkiIyM1cODAy57TYLiqkgHguiKcAQBQc9X4gNasWTPFxMRo4cKFKisr00svveTokgAAAACgSmp8QGvYsKFWr15d5etHjx59DasBAAAAgKqr8U1C
AAAAAKC2IKABAAAAgJMgoAEAAACAk3BIQCspKdG999573ecxm81KSkq65uP+d/s0ALgqFqvN0SUAAAAnU+ObhDiCwSA1j9vq6DIA1HDspwgAAP5XtQW0oqIixcbGqqCgQMHBwZKkQ4cOaebMmZIkX19fJSYm6oYbbtDMmTN14MABWa1WjR49Wj169NDcuXOVnp4uu92uYcOGqWfPnoqOjlarVq2UmZkpT09Pde7cWZ988okKCgqUkpIiSfr66681dOhQFRYWavTo0brnnnv0xRdfaN68eXJxcVGzZs00ffp0bdmyRWlpaSovL9fzzz+vO+64o7oeDQAAAABIqsaAtmnTJrVs2VJjx47VN998o71792rKlClKTExUWFiYNm7cqKVLl6pt27bKz8/Xu+++q9zcXK1Zs0Zubm46efKk1q9fr5KSEvXv31/dunWTJLVr106TJ0/WU089JQ8PDy1fvlwTJkxQenq6JKl+/fp6++23lZeXp379+ikiIkJTpkzRO++8I39/f82fP1+bNm2Sq6urfHx89Oabb1bXIwEAAACAc1RbQMvMzFRERIQkqX379nJ1ddWRI0c0bdo0SZLValVISIiOHTumDh06SPp1j7OxY8dqyZIlysjIUHR0tCSprKxMp06dkiSFh4dLknx8fBQWFlbx75KSEklSp06dZDAY5O/vL29vb+Xn5ysnJ0djxoyRJFksFnXr1k3BwcEKCQmppqcBAAAAAOertoDWokULff311+rRo4e+++47lZWVKSQkRLNnz1aTJk305ZdfKjc3V66urtq2bZsk6ezZsxozZowGDRqkrl27asaMGSovL9eiRYsUFBR0WfMePHhQkpSbm6vi4mL5+fmpcePGWrRokby9vbVz5055enrq9OnTMhppagkAAADAcaotoD3++OOKj4/XwIED1aJFC7m5uWnq1KmaMGGCbLZfO5klJCSoefPm+uyzzzRw4EDZbDY999xzuvvuu/XFF19o0KBBKi4uVo8ePeTl5XVZ81osFg0ZMkTFxcWaPn26XFxcNGnSJI0YMUJ2u1033HCD5syZo9OnT1/2vdjtfNwP4OpZrDZ5uLk4ugwAAOBEDHY7TeOvlMlkktlsdnQZAAAAAJxUVTMD7/QBAAAAgJMgoAEAAACAkyCgAQAAAICTIKABAAAAgJMgoAEAAACAk6hTAS06OlpHjhzRv//9b23ZsqXK49D3EsDVslhtji4BAAA4oWrbB82ZHDp0SB999JGioqKqdL3BIDWP23qNqwJQl7CXIgAAuBCnD2hms1lpaWkqLy9XdHS0Vq5cKaPRqE6dOik2NlZffvmlZs+eLVdXV/n4+CgpKUnbt2/X0aNHFRsbq5KSEvXs2VMfffRRxZiLFy/WDz/8oNTUVPn5+WnJkiVydXVV06ZNNWfOHBmNdWphEQAAAICTqBFJxMfHR2+++aYWLlyoFStWaN26dcrOztaePXu0Y8cO3X///VqzZo369u2rgoKCSscbOXKkbr/9dj322GN6//33NWzYMK1bt0533XWXCgsLq+GOAAAAAOB8NSKghYSE6MSJE8rLy9OIESMqviXLysrSyJEjlZeXp6FDh2rbtm1ydT13UdBeyQdj8fHxSk9P1+DBg7V//35WzwAAAAA4TI1II0ajUUFBQQoMDFRKSopWr16twYMHq3379tqyZYv69Omj1atX65ZbbtGGDRtUr1495ebmSpIyMjIuOF55ebkkKTU1VaNHj9aaNWskSX/729+q78YAAAAA4Hec/hu03zRo0EDDhg1TdHS0bDabmjZtqp49e6q0tFRxcXHy9PSUm5ubpk+frhtvvFHr1q3TwIEDFR4erhtuuOGcsYKDg3X48GGtWLFC7dq10xNPPCFfX1/dcMMNuueeeyqtxW7nA38AV8ditcnDzcXRZQAAACdjsFf2DiDOYzKZZDabHV0GAAAAACdV1cxQI15xBAAAAIC6gIAGAAAAAE6CgAYAAAAAToKABgAAAABO
goAGAAAAAE6i1ge0kpISbdy48ZqOSd9LAJfDYrU5ugQAAFDD1Jh90KoqNzdXGzduVL9+/a7ZmAaD1Dxu6zUbD0DtxH6JAADgStX6FbTFixfrxx9/1MKFCzVy5Eg98cQT6tOnj3bs2KHCwkJFRkbq/9q7/7Aq6/uP46/7AP7i4G9U1FzgYukcc2T6B8y0MX/MzMalXs2BOfMHWDrQCn/EJQppJNbSppe5uEb+QJkyu2aXadu1Yl2ZOYuhTiVc2YBEU7rmAYFzOJ/vHxXLoqV+49wHeD7+4tznvu/z/ry5Psrruu/zuc+cOaOysjJNnjxZLpfL7pIBAAAAtFNt/gpaUlKSSktLFR0drTvvvFOjRo3SO++8o40bNyouLk5PPvmk0tPTZYzRU089JafTaXfJAAAAANqpNh/QPhcaGqrNmzdrz549sixLHo9HkhQVFaWQkBAFBQVpyJAhNlcJAAAAoD1r87c4OhwOeb1ePfvss5oyZYrWrVunUaNGyXy20scrr7yi4OBgBQYG6pVXXrG5WgAAAADtWZu/gtarVy+53W699957euKJJ7RlyxaFhYWpurpaFRUVevbZZ7Vjxw4ZYzRjxgz94Ac/0IABA/7nOY3hy/8Avlmdu1GdggLsLgMAALQibT6gdezYUS+99NLXvn/gwIGmnw8ePHhd57Ss/3dZANoBwhkAALhRbf4WRwAAAABoLQhoAAAAAOAnCGgAAAAA4CcIaAAAAADgJwhoAAAAAOAn2nxAi4mJue59p0+frvLy8m/c77NHqAGApE+X0wcAAPg2tPll9luCZUm3Ln3Z7jIA+AmeiwgAAL4trSagud1urVy5UufOnZPX61VKSoqysrI0cuRInTlzRpZladOmTerSpYvS09NVVlamW265RQ0NDZKkjz76SOnp6aqvr1fHjh2VmZmpsLAwPfPMM/rb3/6mfv36qbq62uZRAgAAAGjPWk1A+8Mf/qAePXpozZo1qq6uVkJCgq5evapJkyYpPT1dS5YsUVFRkbp06aL6+noVFBSosrKy6eHT2dnZSkxM1F133aXDhw8rJydH8+fP19GjR7Vnzx7V1tZq3LhxNo8SAAAAQHvWagJaaWmpjh07ppKSEkmSx+NRdXW1hg4dKkkKCwtTfX29KioqFBUVJUnq37+/wsLCmo7fsmWLfve738kYo6CgIJWVlWnYsGFyOBxyOp2KjIy0Z3AAAAAAoFYU0CIiItSvXz8lJSWprq5Omzdv1ksvvSTLsr6y38svv6wHHnhAVVVVqqqqato+e/ZsRUdH6+zZszp69KjCw8P14osvyuv1qq6uTmVlZXYMDQAAAAAktaKAdv/99+vxxx9XQkKCXC6XZsyYIYfjq4tQxsXF6dixY5o2bZr69++vHj16SJLS0tKUkZGh+vp61dXVacWKFRoyZIgmTJigqVOnqk+fPurVq9d11WIMiwIA+K86d6M6BQXYXQYAAGgDLGNYNP5GxcfHq7Cw0O4yAAAAAPipm80Mbf45aAAAAADQWhDQAAAAAMBPENAAAAAAwE8Q0AAAAADATxDQAAAAAMBPtPuAFhMTc8PHsO4lgDp3o90lAACANqjVPAfNn1iWdOvSl+0uA4CNeBYiAABoCa0moBUWFuqvf/2r6urqdPHiRc2cOVN/+ctf9N577+mxxx7T+fPndejQIXk8HoWEhGjjxo3av3+/9u7dK6/Xq0WLFqm8vFz5+fnyer36yU9+ooULF6qhoUFLlixRZWWlunfvrg0bNigoKMju4QIAAABoh1rVLY41NTXaunWr5s6dq/z8fD333HNavXq19uzZo08++US///3vtXPnTnk8Hh0/flyS1LVrV+Xn5ysyMlJbt27Vzp07VVhYqCtXrqimpka1tbVKTU1Vfn6+XC6XTp06ZfMoAQAAALRXreYKmiQNGTJEkhQSEqLBgwfLsix169ZNbrdbQUFBWrx4sbp06aLz58/L4/FIksLDwyVJ//73v3XbbbepU6dOkqTl
y5dLkrp166aBAwdKknr37q2rV6/6elgAAAAAIKmVXUGzLKvZ7W63W3/+85/1m9/8Runp6fJ6vTKfuXBA2QAADY5JREFUreThcHw6xEGDBulf//qXGhoaJEmLFi1SVVXV154TAAAAAHytVV1B+zqBgYHq3Lmz4uPj1aFDB4WGhurChQvX7NOzZ0/NnTtXCQkJsixLY8eOVd++fW/q84xhgQCgvatzN6pTUIDdZQAAgDbGMoZF429UfHy8CgsL7S4DAAAAgJ+62czQqm5xBAAAAIC2jIAGAAAAAH6CgAYAAAAAfoKABgAAAAB+goAGAAAAAH7CtoBWWFionJycmzr24sWLysjI+HYLugGsewm0H3XuRrtLAAAA7UirfA5aaGiorQHNsqRbl75s2+cD8B2eeQgAAHzJ1oBWXFysBx54QC6XSwsXLtTq1at14MABdezYUTk5OYqIiNCYMWOUkpIiY4zcbrdWrVql4OBgLV68WAUFBZo8ebJGjhypM2fOyLIsbdq0SSEhIVq/fr2OHj0qY4xmzZqliRMnaseOHdq3b58cDoeio6OVlpamQ4cOaevWrQoMDNSAAQP01FNPyeHgzk8AAAAAvmdrQOvcubOef/55Xb58WdOmTZPX6/3KPiUlJU2Bq6ysTC6XS8HBwU3v19TUaNKkSUpPT9eSJUtUVFQkp9Op8vJy7dq1S/X19Zo+fbpiYmJUWFio9PR0DR8+XDt37pTH49H+/fs1a9YsTZo0Sfv27ZPL5VLXrl192QYAAAAAkGTzIiF33HGHLMtSr169FBISok8++aTpPfPZF71Gjx6tO++8UwsWLNCGDRuavbo1dOhQSVJYWJjq6+tVWlqqkydPKjExUXPmzJHH41FlZaXWrl2rXbt2KSEhQZWVlTLGaNmyZTp69KgSEhL0zjvvcPUMAAAAgG1sTSPHjx+X9OmiH7W1terbt68uXLggY4xOnz4tSTpy5Ij69Omj3NxcJScn6+mnn/7KeSzLuuZ1RESERo0apW3btikvL08TJ07UwIEDVVBQoFWrVmn79u06deqU3n33Xe3evVsLFy7U9u3bJUmvvvpqC48aAAAAAJpn6y2OdXV1mjlzpmpra7V69WpVVFRo3rx5GjBgQNNthrfffrtSU1OVl5cnh8Ohhx566BvPe/fdd+vtt9/WjBkzVFtbq7i4ODmdTn3ve9/T1KlT1aNHD/Xt21c//OEP5XK59Ktf/Urdu3dXcHCwxowZ843nN4aFA4D2os7dqE5BAXaXAQAA2gnLGBaNv1Hx8fEqLCy0uwwAAAAAfupmMwNfuAIAAAAAP0FAAwAAAAA/QUADAAAAAD9BQAMAAAAAP0FAAwAAAAA/4fcBLT8/Xxs3btTFixeVkZFxw8fv3r1bbrf7hj7rm7DuJdC21Lkb7S4BAABAks3PQbsRoaGhNxXQtmzZovvuu+9brcWypFuXvvytnhOAfXiuIQAA8Be2BzSXy6UVK1boypUrqq6u1rRp0xQZGak1a9aoW7ducjgcGj58uMrLy7V48WIVFBTo7rvv1oEDB9SxY0fl5OQoIiJCY8aMUUpKiowxcrvdWrVqlUpKSnTx4kWlpqZq06ZNWr9+vY4ePSpjjGbNmqWJEyfq73//+1c+CwAAAADsYHtAO3funCZNmqRx48apqqpKiYmJCgkJ0fr16xUeHq6VK1de13lKSkqajisrK5PL5dK0adO0efNmPfPMM3r99ddVXl6uXbt2qb6+XtOnT1dMTIzWrl17w58FAAAAAC3B9oDWu3dv5eXl6dChQ3I6nfJ4PKqqqlJ4eLgkKTo6Wh9++OHXHm8++0LY6NGj9cEHH2jBggUKDAxUcnLyNfuVlpbq5MmTSkxMlCR5PB5VVlbe0GcBAAAAQEuyfZGQ3NxcDR8+XDk5OZowYYKMMQoNDdXZs2clScePH//KMR06dNCFCxdkjNHp06clSUeOHFGfPn2Um5ur5ORkPf3005Iky7Lk9XoVERGhUaNGadu2
bcrLy9PEiRM1cODAb/wsAAAAAPAV26+gjR07VhkZGfrTn/6k7t27KyAgQGvXrlVaWpqCg4MVHBysbt26XXPMnDlzNG/ePA0YMEBdu3aVJN1+++1KTU1VXl6eHA6HHnroIUnSiBEjNG/ePL344ot6++23NWPGDNXW1iouLk5Op1Pr1q37n58FAAAAAL5iGdM6Fo3/4IMPtGLFCu3YscPuUvTzn8frj38stLsMAN+SOnejOgUF2F0GAABoQ+Lj41VYeOOZwfZbHK/H+fPntWTJEsXFxdldiqRPl9kH0HYQzgAAgL+w/RbH69GvXz/t3bvX7jIAAAAAoEW1ioDmbyoqKhQfH293GQAAAAD8VEVFxU0d12q+gwYAAAAAbV2r+A4aAAAAALQHBDQAAAAA8BMENAAAAADwEwQ0AAAAAPATBDQAAAAA8BMss3+dvF6vMjIydObMGXXo0EFZWVn6zne+Y3dZbcJ9992nkJAQSdLAgQOVlJSkpUuXyrIs3XbbbVq5cqUcDocKCgq0a9cuBQYGKjk5WWPHjlVdXZ0effRRXbp0ScHBwcrOzlbPnj1VXFysJ554QgEBAYqNjdXDDz9s8yj9zz/+8Q/l5ORo27ZtOnfuXIv1/LnnntNrr72mwMBALV++XFFRUTaP3D98sf8nT55UUlKSbr31VknSL37xC/3sZz+j/y3E7XZr+fLlqqioUENDg5KTk/Xd736XOeAjzfW/X79+zAEfamxs1OOPP673339fAQEBWrt2rYwxzAEfaa7/V65cYQ7Y4NKlS4qPj1dubq4CAwP9Zw4YXJeDBw+atLQ0Y4wx7777rklKSrK5orahrq7OTJky5Zpt8+fPN2+99ZYxxpj09HRz6NAhc+HCBXPPPfeY+vp685///Kfp59zcXLNhwwZjjDH79+83mZmZxhhj7r33XnPu3Dnj9XrNnDlzzIkTJ3w7MD/3/PPPm3vuucdMmzbNGNNyPT9x4oRJTEw0Xq/XVFRUmPj4eHsG7Ge+3P+CggLzwgsvXLMP/W85e/bsMVlZWcYYYy5fvmzuuusu5oAPNdd/5oBvvfrqq2bp0qXGGGPeeustk5SUxBzwoeb6zxzwvYaGBrNgwQIzbtw4U1ZW5ldzgFscr9OxY8f04x//WJI0fPhwnThxwuaK2obTp0/r6tWrmj17tmbOnKni4mKdPHlSI0eOlCSNHj1ab775pkpKSvSjH/1IHTp0UEhIiAYNGqTTp09f83sZPXq0Dh8+LJfLpYaGBg0aNEiWZSk2NlaHDx+2c5h+Z9CgQdq4cWPT65bq+bFjxxQbGyvLstS/f381Njbq8uXLtozZn3y5/ydOnNBrr72mX/7yl1q+fLlcLhf9b0ETJkzQr3/966bXAQEBzAEfaq7/zAHfiouLU2ZmpiSpsrJSvXv3Zg74UHP9Zw74XnZ2tu6//3716dNHkn/9LURAu04ul0tOp7PpdUBAgDwej40VtQ2dOnXSgw8+qBdeeEGrVq3SI488ImOMLMuSJAUHB+vKlStyuVxNt0F+vt3lcl2z/Yv7fvF39fl2/Nf48eMVGPjfO5xbquf8Lpr35f5HRUXpscce044dO3TLLbfot7/9Lf1vQcHBwXI6nXK5XFq0aJFSUlKYAz7UXP+ZA74XGBiotLQ0ZWZmavz48cwBH/ty/5kDvlVYWKiePXs2hSzJv/4WIqBdJ6fTqZqamqbXXq/3mj+wcHPCw8N17733yrIshYeHq3v37rp06VLT+zU1NeratetX+l9TU6OQkJBrtv+vfbt27eq7QbVCDsd//yn4Nnv+defAtX76059q2LBhTT//85//pP8t7KOPPtLMmTM1ZcoUTZ48mTngY1/uP3PAHtnZ2Tp48KDS09NVX1/ftJ054Btf7H9sbCxzwIf27t2rN998U4mJiTp16pTS0tKuuapl9xwgoF2n6OhoFRUVSZKKi4sVGRlpc0Vtw549e/Tkk09KkqqqquRyuRQTE6MjR45IkoqKijRixAhF
RUXp2LFjqq+v15UrV3T27FlFRkYqOjpar7/+etO+d9xxh5xOp4KCgvThhx/KGKM33nhDI0aMsG2MrcHQoUNbpOfR0dF644035PV6VVlZKa/Xq549e9o5VL/04IMPqqSkRJJ0+PBhff/736f/Lejjjz/W7Nmz9eijj2rq1KmSmAO+1Fz/mQO+tW/fPm3ZskWS1LlzZ1mWpWHDhjEHfKS5/j/88MPMAR/asWOHtm/frm3btmnIkCHKzs7W6NGj/WYOWMYY0+JdaAM+X8WxtLRUxhitWbNGgwcPtrusVq+hoUHLli1TZWWlLMvSI488oh49eig9PV1ut1sRERHKyspSQECACgoKtHv3bhljNH/+fI0fP15Xr15VWlqaLl68qKCgIK1fv16hoaEqLi7WmjVr1NjYqNjYWKWmpto9VL9TXl6uxYsXq6CgQO+//36L9Xzjxo0qKiqS1+vVsmXLCMuf+WL/T548qczMTAUFBal3797KzMyU0+mk/y0kKytLBw4cUERERNO2FStWKCsrizngA831PyUlRevWrWMO+Ehtba2WLVumjz/+WB6PR3PnztXgwYP5f8BHmut/WFgY/w/YJDExURkZGXI4HH4zBwhoAAAAAOAnuMURAAAAAPwEAQ0AAAAA/AQBDQAAAAD8BAENAAAAAPwEAQ0AAAAA/AQBDQAAAAD8BAENAAAAAPwEAQ0AAAAA/MT/AV/Nce4HdtFWAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 1008x432 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# horizontal bar chart of the 25 leading rows of most_common, sorted by count\n",
    "(most_common\n",
    " .head(25)\n",
    " .set_index('token')['count']\n",
    " .sort_values()\n",
    " .plot.barh(title='Most Common Tokens', figsize=(14, 6)));"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Inspect Result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:59:06.408322Z",
     "start_time": "2020-06-21T02:59:06.405265Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['duluth allete inc',\n",
       " 'nyse ale today reported earnings share net income million operating revenue billion',\n",
       " 'results share net income million operating revenue billion']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clean_articles[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Create n-grams"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T02:59:06.417761Z",
     "start_time": "2020-06-21T02:59:06.410047Z"
    }
   },
   "outputs": [],
   "source": [
    "max_length = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:08:37.112665Z",
     "start_time": "2020-06-21T02:59:06.418792Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2 3 \n",
      "\tDuration:  00:09:31\n",
      "\tngrams: 43,678\n",
      "\n",
      "length\n",
      "2    26965\n",
      "3    16713\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Build n-grams pass by pass: pass n reads the (n-1)-gram corpus, learns phrases\n",
    "# from it and writes the merged corpus that the next pass reads.\n",
    "ngram_frames = []  # one frame per pass; concatenated once after the loop\n",
    "start = time()\n",
    "for n in range(2, max_length + 1):\n",
    "    print(n, end=' ')\n",
    "    # the first pass reads the cleaned articles, later passes the previous output\n",
    "    input_path = clean_article_path if n == 2 else results_path / f'articles_{n-1}_grams.txt'\n",
    "    sentences = LineSentence(input_path.as_posix())\n",
    "    phrases = Phrases(sentences, threshold=100, min_count=10)\n",
    "\n",
    "    # the dict keeps a single score per distinct phrase\n",
    "    s = pd.Series({k.decode('utf-8'): v for k,\n",
    "                   v in phrases.export_phrases(sentences)})\n",
    "    ngram_frames.append(s.to_frame('score').reset_index().rename(\n",
    "        columns={'index': 'phrase'}).assign(length=n))\n",
    "\n",
    "    grams = Phraser(phrases)\n",
    "    sentences = grams[sentences]\n",
    "\n",
    "    # LineSentence reads its input as UTF-8, so write with the same encoding\n",
    "    # instead of relying on the platform default\n",
    "    output_path = results_path / f'articles_{n}_grams.txt'\n",
    "    with output_path.open('w', encoding='utf-8') as f:\n",
    "        for sentence in sentences:\n",
    "            f.write(' '.join(sentence) + '\\n')\n",
    "\n",
    "n_grams = pd.concat(ngram_frames).sort_values('score', ascending=False)\n",
    "# 'phrase' holds the space-separated form, 'ngram' the underscore-joined token\n",
    "n_grams.phrase = n_grams.phrase.str.replace('_', ' ')\n",
    "n_grams['ngram'] = n_grams.phrase.str.replace(' ', '_')\n",
    "\n",
    "print('\\n\\tDuration: ', format_time(time() - start))\n",
    "print(f'\\tngrams: {len(n_grams):,d}\\n')\n",
    "print(n_grams.groupby('length').size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:08:37.144049Z",
     "start_time": "2020-06-21T03:08:37.113775Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>phrase</th>\n",
       "      <th>score</th>\n",
       "      <th>length</th>\n",
       "      <th>ngram</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>length</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"10\" valign=\"top\">2</th>\n",
       "      <th>23317</th>\n",
       "      <td>hidradenitis suppurativa</td>\n",
       "      <td>182,360.93</td>\n",
       "      <td>2</td>\n",
       "      <td>hidradenitis_suppurativa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17916</th>\n",
       "      <td>citigate dewe</td>\n",
       "      <td>182,360.93</td>\n",
       "      <td>2</td>\n",
       "      <td>citigate_dewe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13440</th>\n",
       "      <td>mukhammadsharif mamatkulov</td>\n",
       "      <td>182,360.93</td>\n",
       "      <td>2</td>\n",
       "      <td>mukhammadsharif_mamatkulov</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11037</th>\n",
       "      <td>pracha hariraksapitak</td>\n",
       "      <td>182,360.93</td>\n",
       "      <td>2</td>\n",
       "      <td>pracha_hariraksapitak</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24230</th>\n",
       "      <td>navesh chitrakar</td>\n",
       "      <td>181,947.41</td>\n",
       "      <td>2</td>\n",
       "      <td>navesh_chitrakar</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25079</th>\n",
       "      <td>nidaa tounes</td>\n",
       "      <td>181,947.41</td>\n",
       "      <td>2</td>\n",
       "      <td>nidaa_tounes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12947</th>\n",
       "      <td>koustav samanta</td>\n",
       "      <td>181,947.41</td>\n",
       "      <td>2</td>\n",
       "      <td>koustav_samanta</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18997</th>\n",
       "      <td>axalimogene filolisbac</td>\n",
       "      <td>181,947.41</td>\n",
       "      <td>2</td>\n",
       "      <td>axalimogene_filolisbac</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14914</th>\n",
       "      <td>shilpa jamkhandikar</td>\n",
       "      <td>181,855.77</td>\n",
       "      <td>2</td>\n",
       "      <td>shilpa_jamkhandikar</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10972</th>\n",
       "      <td>krispy kreme</td>\n",
       "      <td>181,855.77</td>\n",
       "      <td>2</td>\n",
       "      <td>krispy_kreme</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"10\" valign=\"top\">3</th>\n",
       "      <th>6433</th>\n",
       "      <td>dana reizniece ozola</td>\n",
       "      <td>189,347.18</td>\n",
       "      <td>3</td>\n",
       "      <td>dana_reizniece_ozola</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13583</th>\n",
       "      <td>daniela palmieri jessica fitzgerald</td>\n",
       "      <td>189,347.18</td>\n",
       "      <td>3</td>\n",
       "      <td>daniela_palmieri_jessica_fitzgerald</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11954</th>\n",
       "      <td>hgst sandisk tegile upthere</td>\n",
       "      <td>189,347.18</td>\n",
       "      <td>3</td>\n",
       "      <td>hgst_sandisk_tegile_upthere</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10424</th>\n",
       "      <td>carson elder beerman herberger</td>\n",
       "      <td>189,347.18</td>\n",
       "      <td>3</td>\n",
       "      <td>carson_elder_beerman_herberger</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7974</th>\n",
       "      <td>freshfields bruckhaus deringer</td>\n",
       "      <td>189,347.18</td>\n",
       "      <td>3</td>\n",
       "      <td>freshfields_bruckhaus_deringer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5611</th>\n",
       "      <td>writing fransiska nangoy</td>\n",
       "      <td>189,347.18</td>\n",
       "      <td>3</td>\n",
       "      <td>writing_fransiska_nangoy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13342</th>\n",
       "      <td>strategist janney montgomery scott</td>\n",
       "      <td>189,347.18</td>\n",
       "      <td>3</td>\n",
       "      <td>strategist_janney_montgomery_scott</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4506</th>\n",
       "      <td>spokesman baik tae hyun</td>\n",
       "      <td>188,917.82</td>\n",
       "      <td>3</td>\n",
       "      <td>spokesman_baik_tae_hyun</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9167</th>\n",
       "      <td>hormone transcon pth transcon</td>\n",
       "      <td>188,917.82</td>\n",
       "      <td>3</td>\n",
       "      <td>hormone_transcon_pth_transcon</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10597</th>\n",
       "      <td>castello di casole</td>\n",
       "      <td>188,917.82</td>\n",
       "      <td>3</td>\n",
       "      <td>castello_di_casole</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           phrase      score  length  \\\n",
       "length                                                                 \n",
       "2      23317             hidradenitis suppurativa 182,360.93       2   \n",
       "       17916                        citigate dewe 182,360.93       2   \n",
       "       13440           mukhammadsharif mamatkulov 182,360.93       2   \n",
       "       11037                pracha hariraksapitak 182,360.93       2   \n",
       "       24230                     navesh chitrakar 181,947.41       2   \n",
       "       25079                         nidaa tounes 181,947.41       2   \n",
       "       12947                      koustav samanta 181,947.41       2   \n",
       "       18997               axalimogene filolisbac 181,947.41       2   \n",
       "       14914                  shilpa jamkhandikar 181,855.77       2   \n",
       "       10972                         krispy kreme 181,855.77       2   \n",
       "3      6433                  dana reizniece ozola 189,347.18       3   \n",
       "       13583  daniela palmieri jessica fitzgerald 189,347.18       3   \n",
       "       11954          hgst sandisk tegile upthere 189,347.18       3   \n",
       "       10424       carson elder beerman herberger 189,347.18       3   \n",
       "       7974        freshfields bruckhaus deringer 189,347.18       3   \n",
       "       5611              writing fransiska nangoy 189,347.18       3   \n",
       "       13342   strategist janney montgomery scott 189,347.18       3   \n",
       "       4506               spokesman baik tae hyun 188,917.82       3   \n",
       "       9167         hormone transcon pth transcon 188,917.82       3   \n",
       "       10597                   castello di casole 188,917.82       3   \n",
       "\n",
       "                                            ngram  \n",
       "length                                             \n",
       "2      23317             hidradenitis_suppurativa  \n",
       "       17916                        citigate_dewe  \n",
       "       13440           mukhammadsharif_mamatkulov  \n",
       "       11037                pracha_hariraksapitak  \n",
       "       24230                     navesh_chitrakar  \n",
       "       25079                         nidaa_tounes  \n",
       "       12947                      koustav_samanta  \n",
       "       18997               axalimogene_filolisbac  \n",
       "       14914                  shilpa_jamkhandikar  \n",
       "       10972                         krispy_kreme  \n",
       "3      6433                  dana_reizniece_ozola  \n",
       "       13583  daniela_palmieri_jessica_fitzgerald  \n",
       "       11954          hgst_sandisk_tegile_upthere  \n",
       "       10424       carson_elder_beerman_herberger  \n",
       "       7974        freshfields_bruckhaus_deringer  \n",
       "       5611              writing_fransiska_nangoy  \n",
       "       13342   strategist_janney_montgomery_scott  \n",
       "       4506               spokesman_baik_tae_hyun  \n",
       "       9167         hormone_transcon_pth_transcon  \n",
       "       10597                   castello_di_casole  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# ten highest-scoring phrases for each n-gram length\n",
    "(n_grams\n",
    " .groupby('length')\n",
    " .apply(lambda group: group.nlargest(n=10, columns='score')))"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "47px",
    "left": "1227px",
    "top": "40px",
    "width": "212px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
